import re
import string
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import plotly.graph_objects as go
from sklearn import metrics
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
# Load the Kindle review dataset from a local CSV file.
# NOTE(review): the path is absolute and machine-specific, and the file name
# contains a space before ".csv" - adjust it when running elsewhere.
data = pd.read_csv("C:\\Users\\lenovo\\Desktop\\r\\preprocessed_kindle_review .csv")
# `df` is a second name for the same DataFrame object, not a copy.
df=data
df
| Unnamed: 0 | rating | reviewText | summary | |
|---|---|---|---|---|
| 0 | 0 | 5 | This book was the very first bookmobile book I... | 50 + years ago... |
| 1 | 1 | 1 | When I read the description for this book, I c... | Boring! Boring! Boring! |
| 2 | 2 | 5 | I just had to edit this review. This book is a... | Wiggleliscious/new toy ready/!! |
| 3 | 3 | 5 | I don't normally buy 'mystery' novels because ... | Very good read. |
| 4 | 4 | 5 | This isn't the kind of book I normally read, a... | Great Story! |
| ... | ... | ... | ... | ... |
| 11995 | 11995 | 2 | Had to read certain passages twice--typos. Wi... | Where's the meat? |
| 11996 | 11996 | 3 | Not what i expected. yet a very interesting bo... | Interesting |
| 11997 | 11997 | 5 | Dragon Knights is a world where Knights ride d... | Dragon Knights, Wings of Change (I Dream of Dr... |
| 11998 | 11998 | 4 | Since this story is very short, it's hard to s... | Good writing, short story |
| 11999 | 11999 | 4 | from 1922 an amazing collection of info on sym... | interesting public domain book |
12000 rows × 4 columns
# printing the dimensions of the data as (rows, columns)
data.shape
(12000, 4)
# displaying the labels of all the columns of the dataset
data.columns
Index(['Unnamed: 0', 'rating', 'reviewText', 'summary'], dtype='object')
# quick preview of the first rows of the dataset
data.head()
| Unnamed: 0 | rating | reviewText | summary | |
|---|---|---|---|---|
| 0 | 0 | 5 | This book was the very first bookmobile book I... | 50 + years ago... |
| 1 | 1 | 1 | When I read the description for this book, I c... | Boring! Boring! Boring! |
| 2 | 2 | 5 | I just had to edit this review. This book is a... | Wiggleliscious/new toy ready/!! |
| 3 | 3 | 5 | I don't normally buy 'mystery' novels because ... | Very good read. |
| 4 | 4 | 5 | This isn't the kind of book I normally read, a... | Great Story! |
# displaying the full, unprocessed text of the first review (row label 0)
data.reviewText[0]
'This book was the very first bookmobile book I bought when I was in the school book club. I loved the story then and I bet a dollar to a donut I will love it again. If my memory serves, I bought this book in 5th grade. That would have been about 1961. I am looking forward to reliving the memories.'
# value_counts() returns the number of rows for each unique rating.
# The result is sorted in descending order, so the most frequent rating comes first.
# `a` is reused further down as the data source of the Plotly bar chart.
a=data.rating.value_counts()
a
5 3000 4 3000 1 2000 3 2000 2 2000 Name: rating, dtype: int64
# counting the null values in each column
data.isnull().sum()
Unnamed: 0 0 rating 0 reviewText 0 summary 0 dtype: int64
# Plotting the rating histogram with Matplotlib
data.rating.hist()
plt.title("Distribution of rating using Matplotlib")
plt.show()
# Plotting the rating counts with Plotly (bar heights and labels come from `a`)
fig = go.Figure([go.Bar(x=a.index, y=a.values,text=a.values)])
fig.update_layout(title='Distribution of the Rating using ploty')
fig.show()
# Listing the columns before dropping the ones that are not needed
data.columns
Index(['Unnamed: 0', 'rating', 'reviewText', 'summary'], dtype='object')
# Drop the index-like column and the summary column.
# The reduced frame is bound to `df` only; `data` itself keeps all four columns
# and is the frame the following cells continue to work on.
df = data.drop(['Unnamed: 0', 'summary'], axis=1)
df
| rating | reviewText | |
|---|---|---|
| 0 | 5 | This book was the very first bookmobile book I... |
| 1 | 1 | When I read the description for this book, I c... |
| 2 | 5 | I just had to edit this review. This book is a... |
| 3 | 5 | I don't normally buy 'mystery' novels because ... |
| 4 | 5 | This isn't the kind of book I normally read, a... |
| ... | ... | ... |
| 11995 | 2 | Had to read certain passages twice--typos. Wi... |
| 11996 | 3 | Not what i expected. yet a very interesting bo... |
| 11997 | 5 | Dragon Knights is a world where Knights ride d... |
| 11998 | 4 | Since this story is very short, it's hard to s... |
| 11999 | 4 | from 1922 an amazing collection of info on sym... |
12000 rows × 2 columns
# preview of `data` (still containing every original column)
data.head()
| Unnamed: 0 | rating | reviewText | summary | |
|---|---|---|---|---|
| 0 | 0 | 5 | This book was the very first bookmobile book I... | 50 + years ago... |
| 1 | 1 | 1 | When I read the description for this book, I c... | Boring! Boring! Boring! |
| 2 | 2 | 5 | I just had to edit this review. This book is a... | Wiggleliscious/new toy ready/!! |
| 3 | 3 | 5 | I don't normally buy 'mystery' novels because ... | Very good read. |
| 4 | 4 | 5 | This isn't the kind of book I normally read, a... | Great Story! |
# converting rating from the 1-5 scale to a binary label
# ratings below 3 (1 and 2) become 1 (negative); ratings of 3 and above become 0 (positive)
data["rating"] = data["rating"].apply(lambda x: 1 if x < 3 else 0) # positive as 0 and negative as 1
data
| Unnamed: 0 | rating | reviewText | summary | |
|---|---|---|---|---|
| 0 | 0 | 0 | This book was the very first bookmobile book I... | 50 + years ago... |
| 1 | 1 | 1 | When I read the description for this book, I c... | Boring! Boring! Boring! |
| 2 | 2 | 0 | I just had to edit this review. This book is a... | Wiggleliscious/new toy ready/!! |
| 3 | 3 | 0 | I don't normally buy 'mystery' novels because ... | Very good read. |
| 4 | 4 | 0 | This isn't the kind of book I normally read, a... | Great Story! |
| ... | ... | ... | ... | ... |
| 11995 | 11995 | 1 | Had to read certain passages twice--typos. Wi... | Where's the meat? |
| 11996 | 11996 | 0 | Not what i expected. yet a very interesting bo... | Interesting |
| 11997 | 11997 | 0 | Dragon Knights is a world where Knights ride d... | Dragon Knights, Wings of Change (I Dream of Dr... |
| 11998 | 11998 | 0 | Since this story is very short, it's hard to s... | Good writing, short story |
| 11999 | 11999 | 0 | from 1922 an amazing collection of info on sym... | interesting public domain book |
12000 rows × 4 columns
# lower-casing the review text so later token comparisons are case-insensitive
data["reviewText"] = data["reviewText"].str.lower()
data.head()
| Unnamed: 0 | rating | reviewText | summary | |
|---|---|---|---|---|
| 0 | 0 | 0 | this book was the very first bookmobile book i... | 50 + years ago... |
| 1 | 1 | 1 | when i read the description for this book, i c... | Boring! Boring! Boring! |
| 2 | 2 | 0 | i just had to edit this review. this book is a... | Wiggleliscious/new toy ready/!! |
| 3 | 3 | 0 | i don't normally buy 'mystery' novels because ... | Very good read. |
| 4 | 4 | 0 | this isn't the kind of book i normally read, a... | Great Story! |
# removing punctuation
PUNCT_TO_REMOVE = string.punctuation
# Translation table built once instead of on every call of remove_punctuation.
_PUNCT_TABLE = str.maketrans('', '', PUNCT_TO_REMOVE)


def remove_punctuation(text):
    """Return *text* with every character of PUNCT_TO_REMOVE deleted.

    Characters are deleted, not replaced by a space, so tokens that were
    separated only by punctuation are merged (for example "twice--typos"
    becomes "twicetypos").
    """
    return text.translate(_PUNCT_TABLE)
# strip punctuation from every review
data["reviewText"] = data["reviewText"].apply(remove_punctuation)
data.head()
| Unnamed: 0 | rating | reviewText | summary | |
|---|---|---|---|---|
| 0 | 0 | 0 | this book was the very first bookmobile book i... | 50 + years ago... |
| 1 | 1 | 1 | when i read the description for this book i co... | Boring! Boring! Boring! |
| 2 | 2 | 0 | i just had to edit this review this book is an... | Wiggleliscious/new toy ready/!! |
| 3 | 3 | 0 | i dont normally buy mystery novels because i j... | Very good read. |
| 4 | 4 | 0 | this isnt the kind of book i normally read alt... | Great Story! |
#removing stop words from the dataset
STOPWORDS = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return *text* without the whitespace-separated tokens found in STOPWORDS."""
    kept_tokens = (token for token in str(text).split() if token not in STOPWORDS)
    return " ".join(kept_tokens)


# filter the stop words out of every review
data["reviewText"] = data["reviewText"].apply(remove_stopwords)
data.head()
| Unnamed: 0 | rating | reviewText | summary | |
|---|---|---|---|---|
| 0 | 0 | 0 | book first bookmobile book bought school book ... | 50 + years ago... |
| 1 | 1 | 1 | read description book couldnt wait read downlo... | Boring! Boring! Boring! |
| 2 | 2 | 0 | edit review book believe got right updated rew... | Wiggleliscious/new toy ready/!! |
| 3 | 3 | 0 | dont normally buy mystery novels dont like how... | Very good read. |
| 4 | 4 | 0 | isnt kind book normally read although try limi... | Great Story! |
lemmatizer = WordNetLemmatizer()
# first letter of the tag returned by nltk.pos_tag -> WordNet part-of-speech constant
wordnet_map = {
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "J": wordnet.ADJ,
    "R": wordnet.ADV,
}


def lemmatize_words(text):
    """Lemmatize every token of *text* according to its part-of-speech tag.

    Tokens whose tag does not start with one of the letters in wordnet_map
    are lemmatized as nouns.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wn_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)


# lemmatize every review
data["reviewText"] = data["reviewText"].apply(lemmatize_words)
data.head()
| Unnamed: 0 | rating | reviewText | summary | |
|---|---|---|---|---|
| 0 | 0 | 0 | book first bookmobile book buy school book clu... | 50 + years ago... |
| 1 | 1 | 1 | read description book couldnt wait read downlo... | Boring! Boring! Boring! |
| 2 | 2 | 0 | edit review book believe get right update rewr... | Wiggleliscious/new toy ready/!! |
| 3 | 3 | 0 | dont normally buy mystery novels dont like how... | Very good read. |
| 4 | 4 | 0 | isnt kind book normally read although try limi... | Great Story! |
# Plotting the rating histogram again, now on the binarized labels
data.rating.hist()
plt.title("Distribution of rating using Matplotlib")
plt.show()
# Plotting the binarized rating counts with Plotly
# (`a` is recomputed because the rating column was overwritten above)
a = data.rating.value_counts()
fig = go.Figure([go.Bar(x=a.index, y=a.values,text=a.values)])
fig.update_layout(title='Distribution of the Rating using ploty')
fig.show()
#Plotting word cloud
# Use the complete text of every review (not just its first token) so the cloud
# reflects overall word frequency. Joining whole strings is also safe for a
# review that is empty after preprocessing, where indexing its first token
# would raise IndexError.
text = " ".join(data.reviewText)
word_cloud = WordCloud(collocations = False, background_color = 'white').generate(text)
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()
from collections import Counter

# Count how often each token occurs across all preprocessed reviews.
# Counter.update adds one per token of the iterable it receives, and using a
# dedicated loop name keeps the module-level `text` variable untouched.
cnt = Counter()
for review in data["reviewText"].values:
    cnt.update(review.split())
cnt.most_common(10)
[('book', 15391),
('story', 11022),
('read', 10022),
('like', 6207),
('one', 5947),
('character', 5670),
('get', 5451),
('love', 5057),
('good', 4761),
('would', 4088)]
# total number of occurrences of a single token (None if the token was never seen)
cnt.get("book")
15391
cn = cnt.most_common(10)
# split the (word, count) pairs into two parallel lists for plotting
w = [pair[0] for pair in cn]
c = [pair[1] for pair in cn]
#Pie chart for Most Frequent Words
fig = px.pie(
    data,
    values=c,
    names=w,
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(title="Most Frequent Words")
fig.show()
#Displaying Rarewords
n_rare_words = 11
# walk the frequency-sorted list backwards to pick the n least common tokens
least_common = cnt.most_common()[:-n_rare_words-1:-1]
RAREWORDS = {word for word, _ in least_common}
RAREWORDS
{'1922',
'backgroung',
'don8216t',
'firedrake',
'gryphon',
'helos',
'insite',
'meaness',
'relm',
'symbols',
'twicetypos'}
# preview of data after preprocessing (binarized rating, cleaned reviewText)
data
| Unnamed: 0 | rating | reviewText | summary | |
|---|---|---|---|---|
| 0 | 0 | 0 | book first bookmobile book buy school book clu... | 50 + years ago... |
| 1 | 1 | 1 | read description book couldnt wait read downlo... | Boring! Boring! Boring! |
| 2 | 2 | 0 | edit review book believe get right update rewr... | Wiggleliscious/new toy ready/!! |
| 3 | 3 | 0 | dont normally buy mystery novels dont like how... | Very good read. |
| 4 | 4 | 0 | isnt kind book normally read although try limi... | Great Story! |
| ... | ... | ... | ... | ... |
| 11995 | 11995 | 1 | read certain passage twicetypos wish build rel... | Where's the meat? |
| 11996 | 11996 | 0 | expect yet interesting book usually don8216t r... | Interesting |
| 11997 | 11997 | 0 | dragon knight world knight ride dragon slay wi... | Dragon Knights, Wings of Change (I Dream of Dr... |
| 11998 | 11998 | 0 | since story short hard say much without give a... | Good writing, short story |
| 11999 | 11999 | 0 | 1922 amazing collection info symbols culture a... | interesting public domain book |
12000 rows × 4 columns
# Stratified 70/30 split so both parts keep the same proportion of each label.
train, test = train_test_split(data, test_size = 0.3, stratify = data['rating'], random_state = 42)
# Binary bag-of-words: a token is kept only if it occurs in at least 10 training
# reviews and in no more than 95% of them.
cv= CountVectorizer(binary=True, min_df = 10, max_df = 0.95)
# Learn the vocabulary from the training reviews only and keep the matrix that
# fit_transform returns, instead of discarding it and transforming a second time.
train_feature_set=cv.fit_transform(train['reviewText'].values)
# The test reviews are encoded with the vocabulary learned from the training set.
test_feature_set=cv.transform(test['reviewText'].values)
train_feature_set
<8400x3773 sparse matrix of type '<class 'numpy.int64'>' with 322893 stored elements in Compressed Sparse Row format>
# number of columns of the feature matrix, i.e. the size of the kept vocabulary
train_feature_set.shape[1]
3773
# column index assigned to the token 'book' in the feature matrix
cv.vocabulary_['book']
422
# label arrays taken from the same rows as the train/test feature matrices
y_train = train['rating'].values
y_test = test['rating'].values
# fit a logistic-regression classifier on the bag-of-words features
lr = LogisticRegression(random_state = 42, max_iter=1000)
lr.fit(train_feature_set,y_train)
y_pred = lr.predict(test_feature_set)
# accuracy over both labels; F1 is called with its defaults on the 0/1 labels
# (label 1 = reviews rated below 3)
print("Accuracy: ",round(metrics.accuracy_score(y_test,y_pred),3))
print("F1: ",round(metrics.f1_score(y_test, y_pred),3))
Accuracy: 0.824 F1: 0.726
# raw-count confusion matrix: rows are the true labels, columns the predicted labels
cm1 = confusion_matrix(y_test, y_pred)
cm1
array([[2127, 273],
[ 360, 840]], dtype=int64)
# confusion matrix normalized over the true labels, so every row sums to 1
cm2 = confusion_matrix(y_test, y_pred,normalize='true')
cm2
array([[0.88625, 0.11375],
[0.3 , 0.7 ]])
# plot the raw-count confusion matrix, labelling the axes with the classifier's classes
disp = ConfusionMatrixDisplay(confusion_matrix=cm1,display_labels=lr.classes_)
disp.plot()
plt.show()
# plot the row-normalized confusion matrix with the same labels
disp = ConfusionMatrixDisplay(confusion_matrix=cm2,display_labels=lr.classes_)
disp.plot()
plt.show()
# coefficients of the first 10 vocabulary columns
feature_importance = lr.coef_[0][:10]
# Invert the word -> column-index mapping once, instead of rebuilding two lists
# and scanning them linearly for every feature.
index_to_word = {idx: word for word, idx in cv.vocabulary_.items()}
for i, v in enumerate(feature_importance):
    print('Feature: ', index_to_word[i], 'Score: ', v)
Feature: 099 Score: 0.16957992320173373 Feature: 10 Score: 0.3260853171189287 Feature: 100 Score: 0.48778686815668726 Feature: 11 Score: -0.3878995778590289 Feature: 12 Score: -0.46282779306757443 Feature: 13 Score: 0.843277736340887 Feature: 14 Score: -0.6039936899557675 Feature: 15 Score: 0.918787327774838 Feature: 16 Score: 0.4645811567303819 Feature: 17 Score: 0.42150522228815623
feature_importance = lr.coef_[0]
# column indices ordered from the smallest to the largest coefficient
sorted_idx = np.argsort(feature_importance)
# Invert the word -> column-index mapping once, instead of rebuilding two lists
# and scanning them linearly for every looked-up word.
index_to_word = {idx: word for word, idx in cv.vocabulary_.items()}
#Top words for the positive class (negative sentiment):
# the 10 largest coefficients, largest first
top_10_pos_w = [index_to_word[idx] for idx in sorted_idx[-1:-11:-1]]
print(top_10_pos_w)
['waste', 'delete', 'cardboard', 'sorry', 'depress', 'boring', '25', 'thin', 'ugh', 'weird']
# bar chart of the 10 largest coefficients (words pushing the prediction towards label 1)
fig = plt.figure(figsize=(10, 6))
ax = sns.barplot(x=top_10_pos_w, y=feature_importance[sorted_idx[range(-1,-11, -1)]])
plt.title("Most Important Words Used for Negative Sentiment",fontsize = 20)
# rotate the word labels so they do not overlap
x_locs,x_labels = plt.xticks()
plt.setp(x_labels, rotation = 40)
plt.ylabel('Feature Importance', fontsize = 12)
plt.xlabel('Word', fontsize = 12);
#Top words for the negative class (positive sentiment):
# Invert the word -> column-index mapping once, instead of rebuilding two lists
# and scanning them linearly for every looked-up word.
index_to_word = {idx: word for word, idx in cv.vocabulary_.items()}
# the 10 smallest (most negative) coefficients, smallest first
top_10_neg_w = [index_to_word[idx] for idx in sorted_idx[:10]]
print(top_10_neg_w)
['enjoyed', 'verne', 'enjoyable', 'shot', 'fun', 'thumb', 'hunter', 'thrill', 'loved', 'hot']
# bar chart of the 10 smallest coefficients (words pushing the prediction towards label 0)
fig = plt.figure(figsize=(10, 6))
ax = sns.barplot(x=top_10_neg_w, y=feature_importance[sorted_idx[:10]])
plt.title("Most Important Words Used for Positive Sentiment",fontsize = 20)
# rotate the word labels so they do not overlap
x_locs,x_labels = plt.xticks()
plt.setp(x_labels, rotation = 40)
plt.ylabel('Feature Importance', fontsize = 12)
plt.xlabel('Word', fontsize = 12);
lr.classes_# class order: label 0 (positive sentiment) first, label 1 (negative sentiment) next
array([0, 1], dtype=int64)
# Run the sample through the same cleaning steps that were applied to the
# training reviews (lower-casing, punctuation removal, stop-word removal,
# lemmatization) so it is encoded the way the model saw its training data.
# NOTE(review): the preprocessed reviews shown above no longer contain "not"
# (e.g. "Not what i expected" became "expect ..."), so the negation in this
# sentence cannot influence the prediction.
sample_review = "I did not enjoy the book"
cleaned_review = lemmatize_words(
    remove_stopwords(remove_punctuation(sample_review.lower()))
)
test_review = cv.transform([cleaned_review])
# class probabilities in the order given by lr.classes_, then the predicted label
p = lr.predict_proba(test_review)
s = lr.predict(test_review)
print("prob are:",p)
print("prediction are:",s)
prob are: [[0.78794775 0.21205225]] prediction are: [0]
# Decision-threshold sweep: a review is labelled 1 when its predicted
# probability for class 1 exceeds the threshold.
# NOTE(review): the thresholds are compared on the test set; choosing one from
# these scores would make the reported F1 optimistic - consider a validation split.
pred_proba_df = pd.DataFrame(lr.predict_proba(test_feature_set))
threshold_list = [0.3,0.4,0.45,0.5]
for i in threshold_list:
    print ('\n******** For i = {} ******'.format(i))
    # Vectorized comparison replaces the element-wise applymap call
    # (DataFrame.applymap is deprecated in recent pandas releases).
    Y_test_pred = (pred_proba_df > i).astype(int)
    # column 1 holds the thresholded probability of class 1
    test_f1 = round(metrics.f1_score(y_test, Y_test_pred.loc[:,1].values),3)
    print('F1: {}'.format(test_f1))
******** For i = 0.3 ****** F1: 0.735 ******** For i = 0.4 ****** F1: 0.733 ******** For i = 0.45 ****** F1: 0.733 ******** For i = 0.5 ****** F1: 0.726